Key Things to Remember

ggplot2 provides unlimited visualization customization

What to Remember from this Section

ggplot2 allows you to:

  1. Think about your graphics via layers
  2. Apply an organized grammar of graphics for visualization development

What to Remember from this Section

Syntax for ggplot2 visualizations will look like:

library(ggplot2)

# Example of the layered syntax: a scatter plot of hwy against displ from mpg,
# points colored by class, with axis labels, a title, and the bw theme.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(color = class)) +
  labs(
    x = "Engine Displacement (liters)",
    y = "Highway Mileage",
    title = "Fuel Economy vs. Engine Displacement"
  ) +
  theme_bw()

Resources Used…

Load these packages to follow along with this tutorial

library(ggplot2)
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union


Import the following data sets from the data folder

# Read the four tutorial data sets; all paths are relative to the working
# directory and point into the data/ folder.

# Excel workbook: only the sheet named "Data" is read.
supermarket <- read_excel(
  path = "data/Supermarket Transactions.xlsx",
  sheet = "Data"
)

# Tab-delimited file.
facebook <- read.delim(file = "data/facebook.tsv")

# Comma-separated files.
reddit <- read.csv(file = "data/reddit.csv")
race <- read.csv(file = "data/race-comparison.csv")

Fundamentals of ggplot2

First Layer is, well, boring

# Only the data is supplied: no aesthetics and no geom layer yet.
ggplot(supermarket)

# Map Purchase Date to x and Revenue to y; still no geom layer, so no data
# is drawn.
ggplot(supermarket, aes(`Purchase Date`, Revenue))
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

Let's Add Geoms

To display the data we need to tell ggplot what to draw

geom_histogram() histogram
geom_freqpoly() frequency polygon
geom_bar() bar chart
geom_point() scatter plot
geom_line() line chart
geom_boxplot() boxplot


Check out all the available geoms at ggplot2.tidyverse.org/reference (formerly docs.ggplot2.org/current)

Univariate Geoms

Histogram, Frequency Polygons & Density Plots

# Three views of the Revenue distribution, each geom called with no arguments.

# Histogram
ggplot(supermarket, aes(Revenue)) +
  geom_histogram()

# Frequency polygon
ggplot(supermarket, aes(Revenue)) +
  geom_freqpoly()

# Density curve
ggplot(supermarket, aes(Revenue)) +
  geom_density()

Histogram, Frequency Polygons & Density Plots

# The same three Revenue plots, now with explicit bin counts and colors.

# Histogram: 100 bins, grey outline, white fill
ggplot(supermarket, aes(Revenue)) +
  geom_histogram(bins = 100, color = "grey40", fill = "white")

# Frequency polygon: 100 bins, blue line
ggplot(supermarket, aes(Revenue)) +
  geom_freqpoly(bins = 100, color = "blue")

# Density curve: red fill at 50% opacity
ggplot(supermarket, aes(Revenue)) +
  geom_density(fill = "red", alpha = .5)

Bar Chart

# Bar chart where geom_bar() is given only an x aesthetic and no y.
ggplot(supermarket, aes(`Product Family`)) +
  geom_bar()

# Same counts computed up front: one row per Product Family with the number
# of rows in column n.
# NOTE(review): this object shares its name with base::summary(); a more
# specific name would be clearer if nothing else relies on it.
summary <- supermarket %>%
  count(`Product Family`)

# stat = "identity" plots the pre-computed n values as the bar heights.
ggplot(summary, aes(`Product Family`, n)) +
  geom_bar(stat = "identity")

Bar Chart

# Bar chart of Product Family with a fill color and outline color.
ggplot(supermarket, aes(`Product Family`)) +
  geom_bar(fill = "dodgerblue", color = "grey40")

# Same chart with bar width set to .75
ggplot(supermarket, aes(`Product Family`)) +
  geom_bar(fill = "dodgerblue", color = "grey40", width = .75)

# Same chart with bar width set to .99
ggplot(supermarket, aes(`Product Family`)) +
  geom_bar(fill = "dodgerblue", color = "grey40", width = .99)

Your Turn



1. Assess the distribution of age, tenure, and gender in the facebook data.


2. Assess the frequency of age range, education, and income range in the reddit data.

Bivariate Geoms

Scatter Plots

# Scatter plot of Revenue against Purchase Date.
ggplot(data = supermarket, aes(x = `Purchase Date`, y = Revenue)) +
  geom_point()

# Same plot with the point color, size, and shape set explicitly.
ggplot(data = supermarket, aes(x = `Purchase Date`, y = Revenue)) +
  geom_point(color = "blue", size = 1, shape = 5)

# Same plot with blue points drawn at alpha = .25.
ggplot(data = supermarket, aes(x = `Purchase Date`, y = Revenue)) +
  geom_point(color = "blue", alpha = .25)

Scatter Plots: overplotting

# Revenue against Units Sold, with Units Sold converted to a factor for the
# x axis.
ggplot(data = supermarket, aes(x = factor(`Units Sold`), y = Revenue)) +
  geom_point()

# Replace the points with jittered points of size 1.
ggplot(data = supermarket, aes(x = factor(`Units Sold`), y = Revenue)) +
  geom_jitter(size = 1)

# Jittered points again, additionally drawn at alpha = .1.
ggplot(data = supermarket, aes(x = factor(`Units Sold`), y = Revenue)) +
  geom_jitter(size = 1, alpha = .1)

Line Charts

# Total Revenue per Purchase Date; NA revenues are dropped before summing.
sales_by_date <- supermarket %>%
  group_by(`Purchase Date`) %>%
  summarise(Revenue = sum(Revenue, na.rm = TRUE))

# Line chart of the summed Revenue against Purchase Date.
ggplot(data = sales_by_date, aes(x = `Purchase Date`, y = Revenue)) +
  geom_line()

Line Charts: fitting additional lines

# Line chart of Revenue against Purchase Date, stored so that different
# smoothers can be layered on top of the same base plot.
sales_plot <- ggplot(sales_by_date, aes(`Purchase Date`, Revenue)) +
  geom_line()

# Smoother with span = .1
sales_plot + geom_smooth(span = .1)

# Smoother with span = .9 and the standard-error band turned off.
# (The arguments must be separated by a comma; a period here is a parse error.)
sales_plot + geom_smooth(span = .9, se = FALSE)

# Linear-model fit, standard-error band turned off.
sales_plot + geom_smooth(method = "lm", se = FALSE)

Box Plot

# Box plot of Revenue for each value of Children (converted to a factor).
ggplot(data = supermarket, aes(x = factor(Children), y = Revenue)) +
  geom_boxplot()

# Notched boxes with a blue fill at alpha = .25.
ggplot(data = supermarket, aes(x = factor(Children), y = Revenue)) +
  geom_boxplot(notch = TRUE, fill = "blue", alpha = .25)

# Outliers drawn in red with shape 1.
ggplot(data = supermarket, aes(x = factor(Children), y = Revenue)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 1)

Box Plot: overplotting

Useful for smaller data sets like mpg

# Box plot of hwy for each class in mpg.
ggplot(data = mpg, aes(x = class, y = hwy)) +
  geom_boxplot()

# Same box plot with the individual observations jittered on top
# (jitter width .2, alpha .5).
ggplot(data = mpg, aes(x = class, y = hwy)) +
  geom_boxplot() +
  geom_jitter(width = .2, alpha = .5)

# Violin plot of the same variables.
ggplot(data = mpg, aes(x = class, y = hwy)) +
  geom_violin()

Bar Chart

Bar charts can show values other than counts on the y-axis

# Bar chart where geom_bar() is given only an x aesthetic and no y.
ggplot(data = supermarket, aes(`Product Family`)) +
  geom_bar()

# Total Revenue per Product Family; NA revenues are dropped before summing.
prod_revenue <- supermarket %>%
  group_by(`Product Family`) %>%
  summarise(Revenue = sum(Revenue, na.rm = TRUE))

# stat = "identity" plots the summed Revenue values as the bar heights.
ggplot(data = prod_revenue, aes(`Product Family`, Revenue)) +
  geom_bar(stat = "identity")

Your Turn



Assess the relationship between tenure and age, gender, likes, etc. in the facebook data.

Multivariate Capabilities

Color, Size, Shape, etc.

# Frequency polygons of Revenue, one colored line per Product Family.
ggplot(data = supermarket, aes(x = Revenue, color = `Product Family`)) +
  geom_freqpoly()

# Bar chart of Product Family with bars filled by Gender and placed side by
# side (position = "dodge").
ggplot(supermarket, aes(x = `Product Family`, fill = Gender)) +
  geom_bar(position = "dodge")

# Scatter plot of Revenue against Purchase Date, points colored by Country.
ggplot(
  data = supermarket,
  aes(x = `Purchase Date`, y = Revenue, color = Country)
) +
  geom_point()

Color, Size, Shape, etc.

# Total Revenue per Purchase Date and Product Family; NA revenues are dropped
# before summing. summarise() only removes the last grouping level, so the
# result is explicitly ungrouped to keep later steps from inheriting a
# leftover grouping on Purchase Date.
prod_revenue <- supermarket %>%
  group_by(`Purchase Date`, `Product Family`) %>%
  summarise(Revenue = sum(Revenue, na.rm = TRUE)) %>%
  ungroup()

# One faint line (alpha = .2) per Product Family, with a smoother of
# span = .1 and no standard-error band drawn over it.
ggplot(prod_revenue, aes(`Purchase Date`, Revenue, color = `Product Family`)) +
  geom_line(alpha = .2) +
  geom_smooth(se = FALSE, span = .1)

Faceting

# Revenue against Purchase Date with a faint line (alpha = .2) and a smoother
# (span = .1, no standard-error band), split into one panel per Product Family.
ggplot(data = prod_revenue, aes(x = `Purchase Date`, y = Revenue)) +
  geom_line(alpha = .2) +
  geom_smooth(span = .1, se = FALSE) +
  facet_wrap(~ `Product Family`)

Faceting

# geom_blank() draws no data, so these two plots show only the panel layout.

# Product Family on the right-hand side of the formula: panels in columns.
ggplot(data = prod_revenue, aes(x = `Purchase Date`, y = Revenue)) +
  geom_blank() +
  facet_grid(. ~ `Product Family`)

# Product Family on the left-hand side of the formula: panels in rows.
ggplot(data = prod_revenue, aes(x = `Purchase Date`, y = Revenue)) +
  geom_blank() +
  facet_grid(`Product Family` ~ .)

Visualization Aesthetics

Scales, Axes and Legends

Themes